This tutorial was made to assist the scientific communication and review process for Iha et al. 2022 (in prep). The pipeline may be discontinuous, but the important commands are presented here.

Nuclear genome assembly and annotation

Workflow

Assembly genome with MaSurCa

Make config file

# quick run configuration file
DATA
PE = pe 350 50 Mp_R1.fastq.gz Mp_R2.fastq.gz
END
PARAMETERS
EXTEND_JUMP_READS=0
GRAPH_KMER_SIZE = auto
USE_LINKING_MATES = 1
USE_GRID=0
GRID_ENGINE=SLURM
GRID_QUEUE=m3tb
LHE_COVERAGE=25
CA_PARAMETERS =  cgwErrorRate=0.15
CLOSE_GAPS=1
NUM_THREADS = 48
JF_SIZE = 250000000000
SOAP_ASSEMBLY=1
FLYE_ASSEMBLY=0
END

Run MaSurca

module load masurca/4.0.5

# The job command(s)

masurca config.txt
./assemble.sh

Total scaffolds: 4713

Mapping raw reads

bowtie/2.3.4
samtools/1.9.0
bbtools/38.37

#Index bowtie2
bowtie2-build --threads 16 scaffolds.fasta masurca

#Run mapping
bowtie2 --no-unal -p 16 -x masurca -1 Mp_R1.fastq.gz -2 Mp_R2.fastq.gz -S masurca.sam &>bowtie.log

#index scaffolds
samtools faidx scaffolds.fasta

#Convert sam to bam and coordinate-sort it; -bt uses the fasta (with its
#.fai index from the step above) to build the bam header
samtools view -@ 16 -bt scaffolds.fasta masurca.sam | samtools sort -@ 16 -o masurca.bam

#coverage info per scaffold (BBTools pileup.sh); input is the sorted bam
#written above (original said "bmasurca.bam", which is never created)
pileup.sh -Xmx30g in=masurca.bam ref=scaffolds.fasta out=mapping_stats_masurca.txt

Cleaning genome

Cleaning with taxon classification using Blobtools v1

The filter steps were adapted from Iha et al. 2021

Make custom database

I used non-redundant (NR) database from GenBank.

Download prot.accession2taxid.gz

Get taxonID for the selected lineages:

  • 2 -> bacteria

  • 2157 -> archaea

  • 4751 -> fungi

  • 10239 -> virus

  • 33634 -> Stramenopiles

    • 2696291 -> Ochrophyta
    • 2870 -> Phaeophyceae
    • 2836 -> Bacillariophyta
  • 2763 -> Rhodophyta

  • 3041 -> Chlorophyta

#Using TaxonKit (https://bioinf.shenwei.me/taxonkit/)

# Create a list of taxids
taxonkit list --ids 2,2157,4751,10239,33634,2763,3041 --indent "" > lineages.taxid.txt

# Total taxIDs:
wc -l lineages.taxid.txt

# Retrieving target accessions
pigz -dc prot.accession2taxid.gz | csvtk grep -t -f taxid -P lineages.taxid.txt | csvtk cut -t -f accession.version,taxid | sed 1d > lineages.acc2taxid.txt

cut -f 1 lineages.acc2taxid.txt > lineages.acc.txt

# Target retrieved:
wc -l lineages.acc.txt

Split lineages.acc.txt in many files

Retrieving FASTA from NR - using array in SLURM

#SBATCH --output=array_%A-%a.out
#SBATCH --array=0-1744

# Retrieving FASTA

blastdbcmd -db nr -entry_batch lineages.acc_${SLURM_ARRAY_TASK_ID} -out - | pigz -c > nr_${SLURM_ARRAY_TASK_ID}.db.fa.gz

Concatenate all sequences to create a unique database

cat nr_*.db.fa.gz > nr.db.fa.gz

# Counting sequences:
pigz -dc nr.db.fa.gz | grep '>' | wc -l

Prepare data to Blobtools v1:

blast+/2.12.0
diamond/2.0.4
samtools/1.9.0

#Create database with Uniprot
diamond makedb --in uniprot_ref_proteomes.fasta -d uniprot_ref_proteomes.diamond

#Running diamond to Uniprot
diamond blastx --query scaffolds.fasta --max-target-seqs 10 --sensitive --threads 16 --db uniprot_ref_proteomes.diamond.dmnd --evalue 1e-25 --outfmt 6 --out masurca.vs.uniprot.1e25.diam.out

#Running Blast to Silva
blastn -task megablast -query scaffolds.fasta -db database/Silva/silva.rDNA.fasta -outfmt '6 qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore' -num_threads 16 -evalue 1e-65 -out masurca.vs.silva.1e65.megablast.out

#Running Blast to Custom database
#Making db to custom database
gunzip database/Custom_db/nr.db.fa.gz
diamond makedb -p 16 --in database/Custom_db/nr.clean.db.fa -d database/Custom_db/nr.db.fa.diamond

#Running Diamond to Custom database
diamond blastx --query scaffolds.fasta --max-target-seqs 10 --sensitive --threads 16 --db database/Custom_db/nr.db.fa.diamond.dmnd --evalue 1e-20 --outfmt 6 --out masurca.vs.customdb.1e20.diamond.out

#taxify for Uniprot
blobtools taxify -f masurca.vs.uniprot.1e25.diam.out -m database/Uniprot/uniprot_ref_proteomes.taxids -s 0 -t 2

#taxify for Silva
blobtools taxify -f masurca.vs.silva.1e65.megablast.out -m database/Silva/silva_ref_rRNA.taxids -s 0 -t 2

#taxify for Custom database
blobtools taxify -f masurca.vs.customdb.1e20.diamond.out -m database/Custom_db/lineages.acc2taxid.txt -s 0 -t 1

#Concat all hits
cat masurca.vs.uniprot.1e25.diam.taxified.out masurca.vs.silva.1e65.megablast.taxified.out masurca.vs.customdb.1e20.diamond.taxified.out > hits.tsv

Run Blobtools

blobtools create -i scaffolds.fasta -t hits.tsv --nodes blobtools/nodes/nodes.dmp --names blobtools/nodes/names.dmp -b masurca.bam -o masurca_blobtools

#Running Blobtools view
blobtools view -i masurca_blobtools.blobDB.json -o masurca_blobview

#Running Blobtool plot
blobtools plot -i  masurca_blobtools.blobDB.json

BlobTools plot

Removing eukaryote scaffolds that are distant from Stramenopiles

cut -f6 masurca_blobview.blobdb.blobDB.bestsum.table.txt | sort -u > 00_taxa_to_remove
#manipulate on vi

Kept only the bold taxa:

Actinobacteria Arthropoda Bacillariophyta Bacteria-undef Bacteroidetes Balneolaeota Basidiomycota Candidatus Tectomicrobia Chloroflexi Chlorophyta Cyanobacteria Eukaryota-undef Evosea Firmicutes Gemmatimonadetes no-hit Oomycota Planctomycetes Proteobacteria Streptophyta undef Verrucomicrobia

grep -f 00_taxa_to_remove masurca_blobview.blobdb.blobDB.bestsum.table.txt | cut -f1 > 00_scaffolds_to_remove_byTaxa.ID

19 scaffolds to remove

grep '>' scaffolds.fasta | grep -v -w -f 00_scaffolds_to_remove_byTaxa.ID | sed 's/>//' > 00_scaffolds_to_keep.ID

seqtk subseq scaffolds.fasta 00_scaffolds_to_keep.ID > 00_scaffolds.clean.fasta
#4694 scaffolds

Get scaffolds that matched with Ectocarpus.

Taxonomic IDs: 2880 (Ectocarpus siliculosus) and 867726 (Ectocarpus sp. CCAP 1310/34)

grep -w -f 00_all_scaffolds.ID ../blobtools/masurca.vs.customdb.diamond.blastx.out | grep -e '2880' -e '867726' | cut -f1 | sort -u > 01_all_Ectocarpus_scaffolds.ID

grep -w -f 01_all_noEctocarpus_scaffolds.ID masurca_blobview.blobdb.blobDB.bestsum.table.txt | grep 'Eukaryota' | cut -f1 > 01_eukaryota.ID

#check this IDs in Uniprot
grep -f 01_eukaryota.ID ../blobtools/masurca.vs.uniprot.1e25.diam.taxified.out

#Total 1355 scaffolds matched with Ectocarpus

# Scaffolds that do not match with Ectocarpus
grep -v -w -f 01_all_Ectocarpus_scaffolds.ID 00_all_scaffolds.ID > 01_noEctocarpus.ID
# 3339 do not match with Ectocarpus

Using transcriptomes to clean the genome

minimap2 -ax splice -C 5 --splice-flank=no --secondary=no -t 16 scaffolds.cleantaxaless1000l.fasta all_transcripts.fasta > mapped_transcripts.sam

#transform in paf format to get CIGAR string info for introns
./paftools.js sam2paf mapped_transcripts.sam > mapped_transcripts.paf

CIGAR string explained:

cg:Z:130M11864N224M3939N188M –> N means introns!

2586 scaffold presents introns

Calculate outliers with the R script –> Outliers.R

Final genome: scaffolds.clean.fasta

3864 scaffolds

Assembly Transcriptomes

We used two transcriptomes downloaded from GenBank

Macrocystis integrifolia transcriptome: PRJNA322132

Reference: https://link.springer.com/article/10.1007/s13205-018-1204-4

RNAseq for the brown alga Macrocystis pyrifera: PRJNA353611

Download data from GenBank

prefetch <SRA_accession_number>
fastq-dump --split-3 <SRA_accession_number> #Decompress the SRA archive and split into paired-end files

Check quality with FastQC and trim reads with Trimmomatic.

Two modes were used: de novo and “genome-guided” Trinity

De novo

Trinity \
        --seqType fq \
        --max_memory 250G \
        --samples_file samples.txt \
        --CPU 20 \
        --output trinity_out_dir

Genome guided

module load star/2.7.9a
module load samtools/1.12
module load trinity/2.12.0

#Genome index
STAR --runThreadN 16 --runMode genomeGenerate --genomeDir genome --genomeFastaFiles scaffolds.clean.fasta --genomeSAindexNbases 11

#Mapping reads
STAR --runThreadN 16 --genomeDir genome --readFilesManifest manifest.tsv

#Sam to bam
samtools view -@ 8 -b -t scaffolds.clean.fasta Aligned.out.sam | samtools sort -@ 8 -o Aligned.out.bam

#assembly
Trinity --genome_guided_bam Aligned.out.bam \
        --genome_guided_max_intron 10000 \
        --max_memory 250G \
        --CPU 20
PRJNA322132 PRJNA353611
De novo Trinity-DNP.fasta Trinity-DNR.fasta
124436 310167
Genome Guide Trinity-GGP.fasta Trinity-GGR.fasta
14586 34452

Concat all transcriptomes.

Genome annotation

Workflow based on Chen et al. 2019

https://github.com/TimothyStephens/Dinoflagellate_Annotation_Workflow

Clean Transcriptomes

Univec database

Univec will be used to clean transcriptomes

#blast+/2.12.0
#pasa/2.5.1

mkdir UNIVEC/
cd UNIVEC/

wget ftp://ftp.ncbi.nlm.nih.gov/pub/UniVec/UniVec
wget ftp://ftp.ncbi.nlm.nih.gov/pub/UniVec/UniVec_Core
wget ftp://ftp.ncbi.nlm.nih.gov/pub/UniVec/README.uv


makeblastdb  -in UniVec -dbtype nucl
makeblastdb  -in UniVec_Core -dbtype nucl

seqclean all_transcripts.fasta -v UNIVEC/UniVec_Core -c 4

#Cleaned transcript: all_transcripts.fasta.clean
# 483627 transcripts

Repeat analysis


#repeatmasker/4.1.2p1
#rmblast/2.11.0
#repeatmodeler/2.0.2a


### If need TRF ####
#Download Tandem Repeats Finder
#https://tandem.bu.edu/trf/trf409.linux64.download.html
mv trf409.legacylinux64 trf
chmod +x trf
mv trf ~/bin
####################

export GENOME_NAME=scaffolds.clean.fasta
export GENOME_SIZE=51923561

#Running Repeat Modeler

# Build the RepeatModeler database and run de novo repeat discovery.
# (Log filenames corrected from "sdtout"/"sdterr" to stdout/stderr.)
BuildDatabase -name ${GENOME_NAME}_db -engine ncbi ${GENOME_NAME} > BuildDatabase.out
RepeatModeler -engine ncbi -pa ${SLURM_NTASKS} -database ${GENOME_NAME}_db 1> RepeatModeler.stdout 2> RepeatModeler.stderr

# Combine repeat modeler and repeat masker libraries.
cp RM_*/consensi.fa* .
cat consensi.fa.classified RepeatMasker.lib > ${GENOME_NAME}_CombinedRepeatLib.lib

#### Run Repeat Masker with the combined library (-gff output, -a keeps alignments)

RepeatMasker -lib ${GENOME_NAME}_CombinedRepeatLib.lib -e ncbi -gff -x -no_is -a -pa ${SLURM_NTASKS} ${GENOME_NAME} 1> RepeatMasker.stdout 2> RepeatMasker.stderr


# SoftMask Genome
maskFastaFromBed -soft -fi ${GENOME_NAME} -fo ${GENOME_NAME}.softmasked -bed ${GENOME_NAME}.out.gff

# Get repeat features in gff3 format (used for EVM later on)
rmOutToGFF3.pl ${GENOME_NAME}.out > ${GENOME_NAME}.out.gff3

# Model repeat landscape (not needed for annotation)
calcDivergenceFromAlign.pl -s ${GENOME_NAME}.divsum ${GENOME_NAME}.align
createRepeatLandscape.pl -div ${GENOME_NAME}.divsum -g ${GENOME_SIZE} > ${GENOME_NAME}.html

PASA

module load perl/5.32.1
module load blat/3.6
module load samtools/1.12
module load pasa/2.5.1

# The job command(s):
export PASA=/apps/pasa/2.5.1/
export GENOME_NAME=scaffolds.clean.fasta
export GENOME_SIZE=51923561
export MAX_INTRON_LENGTH=70000
export SCRIPTS=/scratch1/iha002/WGS_Illumina_AGRF/AGRF_CAGRF21056777_HGHMYDRXY/annotation/Workflow/Dinoflagellate_Annotation_Workflow/scripts

Launch_PASA_pipeline.pl -c alignAssembly.config -C -R -g ${GENOME_NAME} --MAX_INTRON_LENGTH ${MAX_INTRON_LENGTH} --ALIGNERS blat --CPU ${SLURM_NTASKS} -T -t all_transcripts.fasta.clean -u all_transcripts.fasta > pasa.log 2>&1

perl $PASA/scripts/pasa_asmbls_to_training_set.dbi --pasa_transcripts_fasta ${GENOME_NAME}_pasadb.sqlite.assemblies.fasta --pasa_transcripts_gff3 ${GENOME_NAME}_pasadb.sqlite.pasa_assemblies.gff3

perl ${SCRIPTS}/GeneStats.pl ${GENOME_NAME}_pasadb.sqlite.assemblies.fasta.transdecoder.cds ${GENOME_NAME}_pasadb.sqlite.assemblies.fasta.transdecoder.genome.gff3 ${GENOME_NAME} > ${GENOME_NAME}_pasadb.sqlite.assemblies.fasta.transdecoder.cds.stats

Blast PASA output to NR

mkdir blast2db

#Link proteins predicted
ln -s scaffolds.clean.fasta_pasadb.sqlite.assemblies.fasta.transdecoder.pep .

## Get sequences which are type:complete.
grep '>' ${GENOME_NAME}_pasadb.sqlite.assemblies.fasta.transdecoder.pep | awk '{print $1}' | sed -e 's/>//' > Complete_Sequences.ids

## Get all seq IDs that have only  one CDS.
awk '$3=="CDS"{print $0}' ${GENOME_NAME}_pasadb.sqlite.assemblies.fasta.transdecoder.genome.gff3 | awk '{ print $9}' | sed -e 's/.*Parent=\(.*\)/\1/' | sort | uniq -c | awk '{ if($1==1) print $2 }' > Single_CDS_Genes.ids

## Get seq IDs which have coords on the genome.
awk '$3=="mRNA"{print $0}' ${GENOME_NAME}_pasadb.sqlite.assemblies.fasta.transdecoder.genome.gff3 | awk '{ print $9 }' | sed -e 's/ID=\(.*\);Parent.*/\1/' > Genes_with_genome_coords.ids

## Filter IDs

## Get seq IDs that are NOT Single Exon genes, have genome coords, and are type complete. 
## The later GoldenGenes stage will filter these genes anyway so we might as well filter them out now before the intensive blast stage.

python2 ../Dinoflagellate_Annotation_Workflow/scripts/filter_ids.py -i Complete_Sequences.ids -o Complete_Sequences.ids.filtered -k Genes_with_genome_coords.ids -r Single_CDS_Genes.ids

## Get pep sequences in filtered ID list
xargs samtools faidx ${GENOME_NAME}_pasadb.sqlite.assemblies.fasta.transdecoder.pep < Complete_Sequences.ids.filtered > ${GENOME_NAME}_pasadb.sqlite.assemblies.fasta.transdecoder.pep.complete_only.filtered

mkdir fasta_split/

seqkit split2 -s 50 scaffolds.clean.fasta_pasadb.sqlite.assemblies.fasta.transdecoder.pep.complete_only.filtered
ls -1 fasta_split/*.filtered > files2run.txt

Run BLAST array

#SBATCH --array=0-48

# The modules to load:
module load blast+/2.12.0
module load bioref/default

# The job command(s):

if [ -d "$INDIR" ]
then
        SAMPLES=( `ls -1 ${INDIR}/*.filtered | sed -E 's/.+\/(.+)\.filtered/\1/' ` );

        if [ ! -z "$SLURM_ARRAY_TASK_ID" ]
        then
                i=$SLURM_ARRAY_TASK_ID
                blastp -query ${INDIR}/${SAMPLES[$i]}.filtered -db nr  -out ${INDIR}/${SAMPLES[$i]}.blastp.outfmt6 -num_threads ${SLURM_NTASKS} -outfmt "6 qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore qlen slen"
        else
                echo "Error: Missing array index as SLURM_ARRAY_TASK_ID"
        fi
else
        echo "Error: Missing input file directory as --export env INDIR or doesn't exist"
fi

Sorting results

cat FASTA_SPLIT/*.outfmt6 > blastp.outfmt6

cat blastp.outfmt6 | awk -F "\t" '$11<1e-20' | awk -F "\t" '{ if (( (($8-$7)+1) / $13) > 0.8) {print $1}}' | sort | uniq | xargs samtools faidx scaffolds.clean.fasta_pasadb.sqlite.assemblies.fasta.transdecoder.pep.complete_only.filtered > scaffolds.clean.fasta_pasadb.sqlite.assemblies.fasta.transdecoder.pep.complete_only.filtered.top_coverage_onlyquery.faa  #### 1624 sequences

Transposon-PSI

Download Transposon-PSI

Download blastall. The program use formatdb.

Create a ~/.ncbirc on home:

[NCBI]
data=<full-path-to-TransposonPSI-data>/Transposon-PSI/blast-2.2.26/data

perl TransposonPSI_08222010/transposonPSI.pl scaffolds.clean.fasta_pasadb.sqlite.assemblies.fasta.transdecoder.pep.complete_only.filtered.top_coverage.faa prot

cat scaffolds.clean.fasta_pasadb.sqlite.assemblies.fasta.transdecoder.pep.complete_only.filtered.top_coverage.faa.TPSI.allHits | awk '{ print $5 }' | sort | uniq > TransposonPSI.hit.seq.ids

CD-HITs


mkdir CDHITS
cd CDHITS/

grep '>' scaffolds.clean.fasta_pasadb.sqlite.assemblies.fasta.transdecoder.pep.complete_only.filtered.top_coverage.faa | sed -e 's/>//' > Seqs_from_blast.ids

cp ../Dinoflagellate_Annotation_Workflow/scripts/filter_ids.py .

python2 filter_ids.py -i Seqs_from_blast.ids -o Seqs_from_blast.ids.filtered -r ../Transposon-PSI/TransposonPSI.hit.seq.ids

module load samtools/1.12

xargs samtools faidx scaffolds.clean.fasta_pasadb.sqlite.assemblies.fasta.transdecoder.pep.complete_only.filtered.top_coverage.faa < Seqs_from_blast.ids.filtered > scaffolds.clean.fasta_pasadb.sqlite.assemblies.fasta.transdecoder.pep.complete_only.filtered.top_coverage.final.faa


module load cd-hit/4.8.1
cd-hit \
   -i scaffolds.clean.fasta_pasadb.sqlite.assemblies.fasta.transdecoder.pep.complete_only.filtered.top_coverage.final.faa \
   -o scaffolds.clean.fasta_pasadb.sqlite.assemblies.fasta.transdecoder.pep.complete_only.filtered.top_coverage.final.faa.cdhit75 \
   -c 0.75 \
   -n 5 \
   -T 16 \
   -M 30000


grep '>' scaffolds.clean.fasta_pasadb.sqlite.assemblies.fasta.transdecoder.pep.complete_only.filtered.top_coverage.final.faa.cdhit75 | sed -e 's/>//' > scaffolds.clean.fasta_pasadb.sqlite.assemblies.fasta.transdecoder.pep.complete_only.filtered.top_coverage.final.faa.cdhit75.ids

Golden Genes


git clone https://github.com/genomecuration/JAMg.git
date > JAMg/date_retrieved.txt

mkdir Prepare_Golden_Genes

cp JAMg/date_retrieved.txt Prepare_Golden_Genes/
cp JAMg/bin/prepare_golden_genes_for_predictors.pl Prepare_Golden_Genes/prepare_golden_genes_for_predictors.pl
cp JAMg/bin/run_exonerate.pl Prepare_Golden_Genes/run_exonerate.pl
cp -r JAMg/PerlLib Prepare_Golden_Genes/

export PERL5LIB=$PERL5LIB:"/home/iha002/iha002/WGS_Illumina_AGRF/AGRF_CAGRF21056777_HGHMYDRXY/annotation/Prepare_Golden_Genes/PerlLib"

#Install BIO
perl -MCPAN -Mlocal::lib -e shell
cpan[1]> install Bio::Perl

#Install cdbfasta
git clone https://github.com/gpertea/cdbfasta.git
make
cd ~/bin
ln -s ../iha002/WGS_Illumina_AGRF/AGRF_CAGRF21056777_HGHMYDRXY/annotation/Prepare_Golden_Genes/cdbfasta/cdbfasta
ln -s ../iha002/WGS_Illumina_AGRF/AGRF_CAGRF21056777_HGHMYDRXY/annotation/Prepare_Golden_Genes/cdbfasta/cdbyank

## Check it works
module load perl/5.32.1
module load blast+/2.12.0
module load gmap/20210825
module load samtools/1.12
module load exonerate/2.2.0
module load augustus/3.4.0
module load snap-korflab/211201
module load trinity/2.12.0
module load emboss/6.6.0


export PERL5LIB=$PERL5LIB:"<PATH-to-PerlLIB>/Prepare_Golden_Genes/PerlLib"

perl prepare_golden_genes_for_predictors.pl \
        -genome scaffolds.clean.fasta.masked \
        -softmasked scaffolds.clean.fasta.softmasked \
        -no_gmap -same_species -intron 7000 -cpu 16 -norefine -complete -no_single -verbose \
        -pasa_gff *.assemblies.fasta.transdecoder.gff3 \
        -pasa_peptides *.assemblies.fasta.transdecoder.pep \
        -pasa_cds *.assemblies.fasta.transdecoder.cds \
        -pasa_genome *.assemblies.fasta.transdecoder.genome.gff3 \
        -pasa_assembly *.assemblies.fasta

Running golden genes script

mkdir Golden_genes/

ln -s ../repeat/scaffolds.clean.fasta.masked .
ln -s ../repeat/scaffolds.clean.fasta.softmasked .
ln -s ../CDHITS/scaffolds.clean.fasta_pasadb.sqlite.assemblies.fasta.transdecoder.pep.complete_only.filtered.top_coverage.final.faa.cdhit75.ids CD-HIT.ids.txt
ln -s ../../Prepare_Golden_Genes/prepare_golden_genes_for_predictors.pl .

## Get PASA output for GoldenSet.
## 
## This script should get just the IDs given in the CD-HIT.ids.txt file in
## the same format as you would expect from PASA. This seems to be required by the 
## prepare_golden_genes script for some reason.
## This script might not work with different versions of PASA if the seq names are in a different format.
python2 ../Dinoflagellate_Annotation_Workflow/scripts/get_PASA_for__prepare_golden_genes.py --ids CD-HIT.ids.txt --pasa_assembly ../PASA/*.assemblies.fasta --pasa_cds ../PASA/*.assemblies.fasta.transdecoder.cds --pasa_peptides ../PASA/*.assemblies.fasta.transdecoder.pep --pasa_gff ../PASA/*.assemblies.fasta.transdecoder.gff3 --pasa_genome ../PASA/*.assemblies.fasta.transdecoder.genome.gff3

Run golden_genes.script
# I had to install Parafly

GeneMark-ES

# Run GeneMark-ES self-training on the masked assembly, logging both
# stdout and stderr to genemark.log. (The original redirect "1>&2>" was
# malformed: it duplicated stdout onto stderr and left stderr on the
# terminal; "> file 2>&1" captures both streams.)
perl gmes_linux_64/gmes_petap.pl --ES --cores 16 --format GFF3 --v --sequence scaffolds.clean.fasta.masked > genemark.log 2>&1

# Convert the GeneMark GTF to GFF3 with the MAKER helper script.
perl ${MAKER}/genemark_gtf2gff3 genemark.gtf > genemark.gff3

SNAP

mkdir SNAP

ln -s ../repeat/scaffolds.clean.fasta.softmasked .
ln -s ../Golden_genes/final_golden_genes.gff3.nr.golden.zff .

module load samtools/1.12
module load snap-korflab/211201

grep '>' final_golden_genes.gff3.nr.golden.zff | sed 's/>\(.*\)/\1/' | xargs samtools faidx scaffolds.clean.fasta.softmasked > snap.fasta

fathom final_golden_genes.gff3.nr.golden.zff snap.fasta -gene-stats
#118 sequences
#0.506642 avg GC fraction (min=0.465218 max=0.544290)
#142 genes (plus=77 minus=65)
#0 (0.000000) single-exon
#142 (1.000000) multi-exon
#190.731445 mean exon (min=3 max=1576)
#1170.067871 mean intron (min=90 max=6611)

fathom final_golden_genes.gff3.nr.golden.zff snap.fasta -validate > validate.txt

fathom final_golden_genes.gff3.nr.golden.zff snap.fasta -categorize 1000

cat uni.ann wrn.ann > com.ann
cat uni.dna wrn.dna > com.dna

fathom -export 1000 -plus com.ann com.dna

forge export.ann export.dna

hmm-assembler.pl -c 0.000 scaffolds.clean.fasta . > scaffolds.clean.fasta.snap.hmm

mkdir PREDICT; cd PREDICT/

ln -s ../scaffolds.clean.fasta.softmasked .

snap ../scaffolds.clean.fasta.snap.hmm scaffolds.clean.fasta.softmasked -lcmask -quiet 1> stdout.snap 2> stderr.snap

perl ../../Dinoflagellate_Annotation_Workflow/scripts/SNAP_output_to_gff3.pl stdout.snap scaffolds.clean.fasta.softmasked 1> snap.gff3 2> snap.gff3.strerr

AUGUSTUS

I didn’t use Training mode

mkdir PREDICTION

module load augustus/3.4.0

ln -s ../../repeat/scaffolds.clean.fasta.softmasked .

cp ../../Dinoflagellate_Annotation_Workflow/scripts/fasta-splitter.pl .
#Had to install File::Util - perl module

perl fasta-splitter.pl --n-parts 100 --out-dir FASTA_SPLIT ${GENOME_NAME}.softmasked

augustus --softmasking=1 --gff3=on --UTR=off --codingseq=on --protein=on --exonnames=on --species=${SPECIES} ${INDIR}/${SAMPLES[$i]}.softmasked > ${INDIR}/${SAMPLES[$i]}.augustus.out

cat FASTA_SPLIT/*.out | join_aug_pred.pl > augustus.gff3
getAnnoFasta.pl augustus.gff3
#augustus3.aa
#augustus3.codingseq

#Install Evidence modeler
export PERL5LIB=$PERL5LIB:"/home/iha002/sw/EVidenceModeler-1.1.1/PerlLib"
cp ~/sw/EVidenceModeler-1.1.1/EvmUtils/misc/augustus_GFF3_to_EVM_GFF3.pl .

perl augustus_GFF3_to_EVM_GFF3.pl augustus.gff3 | sed '/^$/d' > augustus.cleaned.gff3
perl GeneStats.pl augustus3.codingseq augustus.cleaned.gff3 ../../scaffolds.clean.fasta > augustus.cleaned.gff3.stats

Evidence Modeler (Finally!!!)

Prepare data:

mkdir EVIDENCE_MODELER

cd EVIDENCE_MODELER/

ln -s ../scaffolds.clean.fasta .
ln -s ../GeneMark/genemark.gff3 .
ln -s ../SNAP/PREDICT/snap.gff3 .
ln -s ../AUGUSTUS/augustus.cleaned.gff3 .
ln -s ../PASA/scaffolds.clean.fasta_pasadb.sqlite.assemblies.fasta.transdecoder.genome.gff3 .
ln -s ../repeat/scaffolds.clean.fasta.out.gff .

grep -v "^#" genemark.gff3 > abinitio_gene_predictions.gff3
grep -v "^#" snap.gff3 | sed '/^$/d' | awk '{print $1 "\tSNAP\t" $3 "\t" $4 "\t" $5 "\t" $6 "\t" $7 "\t"$8 "\t" $9}' >> abinitio_gene_predictions.gff3
cat augustus.cleaned.gff3 >> abinitio_gene_predictions.gff3
sed '/^$/d' scaffolds.clean.fasta_pasadb.sqlite.assemblies.fasta.transdecoder.genome.gff3 >> abinitio_gene_predictions.gff3

Make weights.txt

ABINITIO_PREDICTION     GeneMark.hmme 2
ABINITIO_PREDICTION     SNAP    2
ABINITIO_PREDICTION     Augustus        6
OTHER_PREDICTION        transdecoder    10

Run Evidence Modeler

mkdir PARTITIONS_EVM
cd PARTITIONS_EVM/

ln -s ../abinitio_gene_predictions.gff3 .

partition_EVM_inputs.pl --genome scaffolds.clean.fasta --gene_predictions abinitio_gene_predictions.gff3 --segmentSize 50000000 --overlapSize 10000 --partition_listing partitions_list.out

write_EVM_commands.pl --genome scaffolds.clean.fasta --gene_predictions abinitio_gene_predictions.gff3 --weights weights.txt --output_file_name evm.out --partitions partitions_list.out > commands.list

ParaFly -v -CPU 16 -c commands.list -failed_cmds commands.list.failed

recombine_EVM_partial_outputs.pl --partitions partitions_list.out --output_file_name evm.out

convert_EVM_outputs_to_GFF3.pl --partitions partitions_list.out --output evm.out --genome /scratch2/iha002/Macrocystis_annotation/scaffolds.clean.fasta

cd ../

# Concatenate every per-partition EVM GFF3 into one file. The -exec
# terminator must be escaped as \; — a bare ";" is consumed by the shell
# before find ever sees it, making the original command fail.
find PARTITIONS_EVM/ -name evm.out.gff3 -exec cat '{}' \; > scaffolds.clean.fasta.evm.gff3


# Get CDS and Proteins.

#sed 's@EVM.evm.TU@EVM.evm.TU@' will remove ^\ (file seperator) character from EVM^\evm.TU to EVM.evm.TU

gff3_file_to_proteins.pl scaffolds.clean.fasta.evm.gff3 scaffolds.clean.fasta CDS | sed 's@EVM.evm.TU@EVM.evm.TU@' > scaffolds.clean.fasta.evm.cds.fna

gff3_file_to_proteins.pl scaffolds.clean.fasta.evm.gff3 scaffolds.clean.fasta prot | sed 's@EVM.evm.TU@EVM.evm.TU@' > scaffolds.clean.fasta.evm.protein.faa

DONE!


Genotyping and genetic structure

Genotyping assembly with genome reference

Checking quality

After the Illumina sequencing, the samples were demultiplexed in fastq/ directory. All samples had the quality checked with FastQC and visualized with MultiQC.

module load fastqc/0.11.9

cd fastq/

for i in *fq.gz
do
        fastqc -t 16 $i
done

multiqc .

The samples A7 and A8 were removed from the genotyping assembly due to low coverage.

Prepare sample for gstacks

We used the assembled genome as the reference for SNP calling. We had to map the reads of each individual against the genome before running STACKS. https://catchenlab.life.illinois.edu/stacks/manual/.

The assembly was performed on the HPC [Petrichor](https://www.csiro.au/en/research/technology-space/it/petrichor-hpc), which is based on SLURM. We created one script to run BWA and Samtools for each sample. First, create the base script:

#!/bin/bash

# The name of the job:
#SBATCH --job-name="bwa"

# Multithreaded (SMP) job: must run on one node
#SBATCH --nodes=1
# Maximum number of tasks/CPU cores used by the job:
#SBATCH --ntasks=4

# The amount of memory in megabytes per process in the job:
#SBATCH --mem=20G

# The maximum running time of the job in days-hours:mins:sec
#SBATCH --time=1-0:0:00

# The modules to load:
module load bwa/0.7.17
module load samtools/1.12

# The job command(s)

Later, create one script for each sample:

mkdir alignment
mkdir alignment/bam

for fastq in $(ls fastq/*.gz)
do
        input=$(basename $fastq | sed 's/-1A.*//')
        echo ${input}
        cat bwa.script > bwa.${input}.script
        echo "bwa mem -t 4 scaffolds.clean.fasta ${fastq} > alignment/${input}.sam" >> bwa.${input}.script
        echo "samtools view -@ 4 -b -T scaffolds.clean.fasta alignment/${input}.sam | samtools sort -@ 4 -o alignment/bam/${input}.bam" >> bwa.${input}.script
done

Send all scripts to queue:

for i in bwa.*.script ; do sbatch $i ; done

With all individuals mapped against the genome, we performed SNP calling with STACKS, using a population mapping file with “population” and “region”:

# gstacks reads the sorted bams from alignment/bam/ — the directory
# created in the mapping step (original said "alignments/bam/").
gstacks -I alignment/bam/  -M pop_map -O gstacks2 -t 8
# populations: keep loci present in >=65% of individuals per population
# (-r 0.65); emit VCF and GenePop output plus F-statistics and HWE tests.
populations -P gstacks2 -M pop_map -r 0.65 --vcf --genepop --fstats --smooth --hwe -t 8

The STACKS output is saved in the gstacks2 directory.

Checking vcf file from STACKS-Populations

This step is a mix of R and bash commands. We used VCFtools (bash) to calculate relevant information in the vcf file and R to make plots with those information:

Using VCFTools

Calculate allele frequency
vcftools --vcf gstacks2/populations.snps.vcf --freq2 --out gstacks
Calculate mean depth per individual
vcftools --vcf gstacks2/populations.snps.vcf --depth  --out gstacks
Calculate mean depth per site
vcftools --vcf gstacks2/populations.snps.vcf --site-mean-depth --out gstacks
Calculate proportion of missing data per individual
vcftools --vcf gstacks2/populations.snps.vcf --missing-indv --out gstacks
Calculate proportion of missing data per site
vcftools --vcf gstacks2/populations.snps.vcf --missing-site --out gstacks

R

# Packages for VCF handling, plotting, and population genetics.
# (Duplicate library(ggpubr) call removed — it was loaded twice.)
library(vcfR)         # read/inspect VCF files
library(tidyverse)    # data manipulation and ggplot2
library(ggpubr)       # ggarrange() for multi-panel figures
library(poppr)        # population genetics on genlight objects
library(RColorBrewer) # colour palettes
library(tidyr)        # also attached by tidyverse; kept for explicitness
library(reshape2)     # melt/dcast helpers

# Load the raw (unfiltered) SNP calls produced by STACKS populations and
# summarise the object (sample count, variants, missingness).
vcf <- read.vcfR("gstacks2/populations.snps.vcf", verbose = FALSE)
show(vcf)
## ***** Object of Class vcfR *****
## 43 samples
## 2232 CHROMs
## 21,051 variants
## Object size: 46.4 Mb
## 30.21 percent missing data
## *****        *****         *****
#Allele frequency and Minimum allele frequency (MAF)
# skip = 1 drops the vcftools header row; col_names supplies our own names
freq <- read_delim("gstacks2/gstacks.frq", delim = "\t", col_names = c("CHROM", "POS", "N_ALLELES", "N_CHR", "a1" ,"a2"), skip = 1)
# per-site MAF = the smaller of the two allele frequencies (columns a1, a2)
freq$maf <- freq %>% select(a1, a2) %>% apply(1, function(z) min(z))
summary(freq$maf)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.01163 0.02273 0.05556 0.14746 0.25000 0.50000
f <- ggplot(freq, aes(maf)) + geom_density(fill = "dodgerblue1", colour = "black", alpha = 0.3)

#Depth per individual (bar per sample; x labels rotated for readability)
depth_sample <- read_delim("gstacks2/gstacks.idepth", delim = "\t")
d <- ggplot(depth_sample, aes(INDV,MEAN_DEPTH)) + geom_col() + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))

#Depth per variant
depth_var <- read_delim("gstacks2/gstacks.ldepth.mean", delim = "\t")
summary(depth_var$MEAN_DEPTH)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##    1.000    5.987   11.163   41.514   37.638 4024.980
# x-axis limited to 0-500 so the few very-high-coverage sites (max ~4025)
# do not compress the density curve
dv <- ggplot(depth_var, aes(MEAN_DEPTH)) + geom_density(fill = "dodgerblue1", colour = "black", alpha = 0.3) + theme_light() + xlim(0,500)

#Missing per individual
miss_ind <- read_delim("gstacks2/gstacks.imiss", delim = "\t")
m <- ggplot(miss_ind, aes(INDV, F_MISS)) + geom_col() + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))

#Missing per variant
miss_var <- read_delim("gstacks2/gstacks.lmiss", delim = "\t")
summary(miss_var$F_MISS)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.00000 0.02326 0.16279 0.30211 0.58139 0.90698
mv <- ggplot(miss_var, aes(F_MISS)) + geom_density(fill = "dodgerblue1", colour = "black", alpha = 0.3)

# Assemble the five QC panels into a single 2 x 3 figure
ggarrange(f, d, dv, m, mv, 
          ncol = 2, nrow = 3)

Filtering SNPs using VCFTools

vcftools --vcf gstacks2/populations.snps.vcf --maf 0.02 --max-missing 0.6 --min-meanDP 5 --max-meanDP 250 --minDP 5 --maxDP 250 --recode --out gstacks2/populations.filtered
Checking filtered VCF on R
# Re-load the VCF after vcftools filtering (samples renamed) and confirm
# the reduction: 21,051 -> 5,911 variants, missing data reported as 0%.
vcf <- read.vcfR("gstacks2/populations.filtered.recode.renamed.vcf", verbose = FALSE)
show(vcf)
## ***** Object of Class vcfR *****
## 43 samples
## 1505 CHROMs
## 5,911 variants
## Object size: 19.7 Mb
## 0 percent missing data
## *****        *****         *****

Genetic population analysis

Workflow based on Grunwald lab tutorials and adegenet tutorial for DAPC

First, import data attributes.

# Sample metadata: one row per individual with population, region, and
# phenotype labels (file has no header, so col_names supplies the names).
pop.data <- read_delim("gstacks2/pop_map2.renamed", delim = "\t", col_names = c("SampleID", "Population", "Region", "Phenotype"))
head(pop.data)
## # A tibble: 6 x 4
##   SampleID Population Region Phenotype  
##   <chr>    <chr>      <chr>  <chr>      
## 1 F3       F          SOUTH  NO_TOLERANT
## 2 F4       F          SOUTH  NO_TOLERANT
## 3 F5       F          SOUTH  NO_TOLERANT
## 4 F6       F          SOUTH  NO_TOLERANT
## 5 F7       F          SOUTH  NO_TOLERANT
## 6 F8       F          SOUTH  NO_TOLERANT

Check if all the samples in the VCF and the population data frame are included

# Sample order in the VCF must match the metadata rows exactly;
# [-1] skips the first @gt column (the FORMAT field in vcfR) so only
# sample names are compared.
all(colnames(vcf@gt)[-1] == pop.data$SampleID)
## [1] TRUE

Convert to genlight object. The genlight object was used to follow analysis.

#change order of populations (explicit factor levels A-F control the
#ordering used in plot legends and group comparisons)
pop.data$Population <- factor(pop.data$Population, levels=c("A", "B", "C", "D", "E", "F"))

# Convert to a genlight object and attach the population assignments.
gl <- vcfR2genlight(vcf)
pop(gl) <- pop.data$Population
head(gl)
##  /// GENLIGHT OBJECT /////////
## 
##  // 6 genotypes,  5,911 binary SNPs, size: 599.1 Kb
##  5462 (15.4 %) missing data
## 
##  // Basic content
##    @gen: list of 6 SNPbin
## 
##  // Optional content
##    @ind.names:  6 individual labels
##    @loc.names:  5911 locus labels
##    @chromosome: factor storing chromosomes of the SNPs
##    @position: integer storing positions of the SNPs
##    @pop: population of each individual (group size range: 6-6)
##    @other: a list containing: elements without names
# Free the raw objects — downstream analyses use only gl
rm(vcf, pop.data)

The first question we want to answer is “how many groups are in our samples?”. Although we collected the samples from different localities, this does not mean each site is a true population. We will infer the groups/clusters using the K-means approach and Discriminant Analysis of Principal Components (DAPC) Jombart et al. 2010

PCA

First, quick look at PCA.

# One Dark2 colour per population for all subsequent plots
cols <- brewer.pal(n = nPop(gl), name = "Dark2")

pca <- glPca(gl, nf = 2)
#nf = an integer indicating the number of principal components to be retained
# Scree plot: each eigenvalue as a percentage of the total variance
barplot(100*pca$eig/sum(pca$eig), col = heat.colors(50), main = "PCA Eigenvalues")
title(ylab="Percent of variance\nexplained", line = 2)
title(xlab="Eigenvalues", line = 1)

The plot indicates that we can retain the first 2 PCs, which explain ~39% of the variance

pca.scores <- as.data.frame(pca$scores)
pca.scores$pop <- pop(gl)

set.seed(9)
ggplot(pca.scores, aes(x=PC1, y=PC2, colour=pop)) + 
  geom_point(size=2) + 
  stat_ellipse(level = 0.9, size = 0.5) + 
  scale_color_manual(name="Populations", values = cols) + 
  geom_hline(yintercept = 0) + 
  geom_vline(xintercept = 0) + 
  theme_bw()

PC1 (highest eigenvalue) indicates that the south populations are different from the north populations. PC2 differentiates HI from the other south populations.

“However, PCA lacks some essential features for investigating the genetic structure of biological populations. First, it does not provide a group assessment, and would require a priori definition of clusters to study population structures. But even then, PCA would not be appropriate to obtain a clear picture of between-population variation (Figure 1). PCA aims to summarize the overall variability among individuals, which includes both the divergence between groups (i.e., structured genetic variability), and the variation occurring within groups (‘random’ genetic variability). To assess the relationships between different clusters, an adequate method should focus on between-group variability, while neglecting within-group variation.” Jombart et al. 2010

We used find.clusters to determine the groups in our samples and DAPC to determine the population structure. Reference. Good practices for using DAPC were based on Miller et al. 2020

K-means

# Maximum number of clusters (K) to evaluate
maxK <- 10

# 10 replicate runs (rows) x maxK candidate K values (columns).
# K-means inside find.clusters is stochastic, so the run is repeated
# to show the spread of BIC values per K in the boxplot below.
myMat <- matrix(nrow=10, ncol=maxK)
colnames(myMat) <- 1:ncol(myMat)

# Run find.clusters 10 times, keeping all 40 PCs, and store the BIC
# for each candidate K (grp$Kstat).
# NOTE(review): grp from the LAST iteration is reused later for the
# population-by-cluster table — confirm that is intended.
for(i in 1:nrow(myMat)){
  grp <- find.clusters(gl, n.pca=40, choose.n.clust = FALSE, max.n.clust = maxK, criterion = "diffNgroup")
  myMat[i,] <- grp$Kstat
}

#Make long data
my_df <- melt(myMat)
colnames(my_df)[1:3] <- c("Group", "K", "BIC")
my_df$K <- as.factor(my_df$K)

# Plot cluster inference results
Kplot <- ggplot(my_df, aes(K, BIC)) + geom_boxplot() + theme_bw() + xlab("Number of groups (K)")
Kplot

The results indicate that our samples are structured in 2 or 3 clusters (lower BIC values). However, the best Kstats indicate 2 clusters.

We can compare the original populations with the inferred clusters:

table(pop(gl), grp$grp)
##    
##     1 2
##   A 0 6
##   B 0 8
##   C 0 7
##   D 8 0
##   E 8 0
##   F 6 0
table.value(table(pop(gl), grp$grp), col.labels = paste("Inferred_cluster", 1:6), row.labels = levels(pop(gl)))

The plot shows that south and north populations are grouped in two distinct clusters.

DAPC

Based on the PCA results, we retained two components (n.pca = 2) and two DA eigenvalues (n.da = 2).

dapc <- dapc(gl, n.pca = 2, n.da = 2)
scatter(dapc, scree.da=TRUE, posi.da = "bottomright", col = cols, cex = 3, scree.pca = TRUE)

Create STRUCTURE-like plot

#Create composition plot
dapc.results <- as.data.frame(dapc$posterior)
dapc.results$pop <- pop(gl)
dapc.results$indNames <- rownames(dapc.results)
dapc.results <- pivot_longer(dapc.results, -c(pop, indNames))
head(dapc.results, n = 6)
## # A tibble: 6 x 4
##   pop   indNames name      value
##   <fct> <chr>    <chr>     <dbl>
## 1 F     F3       A     7.58e- 73
## 2 F     F3       B     7.01e-121
## 3 F     F3       C     6.20e-109
## 4 F     F3       D     2.03e- 85
## 5 F     F3       E     9.03e-  9
## 6 F     F3       F     1.00e+  0
colnames(dapc.results) <- c("Original_Pop","Sample","Assigned_Pop","Posterior_membership_probability")

t <- ggplot(dapc.results, aes(x=Sample, y=Posterior_membership_probability, fill=Assigned_Pop)) + 
  geom_bar(stat = 'identity') +
  scale_fill_manual(name="Populations", values = cols) +
  facet_grid(~Original_Pop, scales = "free") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, size = 8)) +
  labs(y="Posterior membership probability", x=NULL)
t

DONE!


Genotyping associated with warm-tolerance

Assembly RAD-seq de novo for Group 1 and Group 2

As the reference-genome-based assembly resulted in few SNPs and there was no SNP related to therm-tolerance when comparing therm-tolerant versus non-therm-tolerant samples, we performed another approach to i) get more SNPs, ii) match the number of therm-tolerant and non-tolerant individuals; and iii) exclude samples with low sequencing depth.

Two groups, each including the same 7 therm-tolerant individuals but a different set of 7 non-tolerant individuals, were created to check that the SNPs related to the phenotype are not random.

The 7 therm-tolerant individuals are the same in both groups, but the non-tolerant ones were randomly selected, respecting the north/south proportion and removing low-depth samples.

Region Therm tolerant NO therm tolerant Group 1 NO therm tolerant Group 2
N C2 A1 C4
N C3 B8 C6
N C8 B2 B5
N B7 C5 A2
S E5 E3 F6
S E6 E4 E2
S D2 D6 D5

De novo assembly with Stacks

Stacks v.2.60

In shell script:

mkdir group1 group2

mkdir group1/fastq group2/fastq #mv fastq files for all individuals of each group to respective group directory

#Same script for each group
denovo_map.pl --samples fastq --popmap pop_map -o stacks -M 4 -r 3 -p 2
populations -P stacks -M pop_map --min-samples-per-pop 0.65 --min-populations 2 --vcf --genepop --structure --fstats --hwe -t 8

Analysis of populations.snps.vcf

First analysis using VCFtools made for both Groups. Inspired on https://speciationgenomics.github.io/filtering_vcfs/.

VCFtools - 0.1.16

Analysis Group 1

  1. Calculate allele frequency
vcftools --vcf gwas/populations1.snps.vcf --freq2 --out group1 --max-alleles 2
  1. Calculate mean depth per individual
vcftools --vcf gwas/populations1.snps.vcf --depth --out group1
  1. Calculate mean depth per variant
vcftools --vcf gwas/populations1.snps.vcf --site-mean-depth --out group1
  1. Calculate proportion of missing data per individual
vcftools --vcf gwas/populations1.snps.vcf --missing-indv --out group1
  1. Calculate proportion of missing data per variant
vcftools --vcf gwas/populations1.snps.vcf --missing-site --out group1

Visualize all files in R:

library(vcfR)
library(tidyverse)
library(ggpubr)

#Check vcf file
vcf <- read.vcfR("gwas/populations1.snps.vcf", verbose = FALSE)
show(vcf)
## ***** Object of Class vcfR *****
## 14 samples
## 26393 CHROMs
## 41,037 variants
## Object size: 49.8 Mb
## 10.45 percent missing data
## *****        *****         *****
#Allele frequency and Minimun allele frequency (MAF)
freq1 <- read_delim("gwas/group1.frq", delim = "\t", col_names = c("CHROM", "POS", "N_ALLELES", "N_CHR", "a1" ,"a2"), skip = 1)
#Create minor allele frequency 
freq1$maf <- freq1 %>% select(a1, a2) %>% apply(1, function(z) min(z))
summary(freq1$maf)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.03571 0.03846 0.07143 0.13036 0.16667 0.50000
f1 <- ggplot(freq1, aes(maf)) + geom_density(fill = "dodgerblue1", colour = "black", alpha = 0.3) + ggtitle("Allele frquency")

#Depth per individual
depth1 <- read_delim("gwas/group1.idepth", delim = "\t")
g1 <- ggplot(depth1, aes(INDV,MEAN_DEPTH)) + geom_col() + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) + ggtitle("Depth per individual")

#Depth per variant
depth_var1 <- read_delim("gwas/group1.ldepth.mean", delim = "\t")
summary(depth_var1$MEAN_DEPTH)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    2.10   15.50   30.36   49.95   79.08  631.07
dv1 <- ggplot(depth_var1, aes(MEAN_DEPTH)) + geom_density(fill = "dodgerblue1", colour = "black", alpha = 0.3) + theme_light() + xlim(0,300) + ggtitle("Depth per variant")

#Missing per individual
miss_ind1 <- read_delim("gwas/group1.imiss", delim = "\t")
mi1 <- ggplot(miss_ind1, aes(INDV, F_MISS)) + geom_col() + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) + ggtitle("Missing per individual")

#Missing per variant
miss_var1 <- read_delim("gwas/group1.lmiss", delim = "\t")
summary(miss_var1$F_MISS)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.00000 0.00000 0.07143 0.10453 0.14286 0.28571
mv1 <- ggplot(miss_var1, aes(F_MISS)) + geom_density(fill = "dodgerblue1", colour = "black", alpha = 0.3) + ggtitle("Missing per variant")


ggarrange(f1, g1, dv1, mi1, mv1, 
          ncol = 2, nrow = 3)

Analysis Group 2

  1. Calculate allele frequency
vcftools --vcf populations2.snps.vcf --freq2 --out group2 --max-alleles 2
vcftools --vcf populations2.snps.vcf --depth --out group2
vcftools --vcf populations2.snps.vcf --site-mean-depth --out group2
vcftools --vcf populations2.snps.vcf --missing-indv --out group2
vcftools --vcf populations2.snps.vcf --missing-site --out group2

Visualize all files in R:

library(vcfR)
library(tidyverse)
library(ggpubr)

#Check vcf file
vcf <- read.vcfR("gwas/populations2.snps.vcf", verbose = FALSE)
show(vcf)
## ***** Object of Class vcfR *****
## 14 samples
## 26958 CHROMs
## 42,274 variants
## Object size: 51.3 Mb
## 11.28 percent missing data
## *****        *****         *****
#Allele frequency and Minimun allele frequency (MAF)
freq2 <- read_delim("gwas/group2.frq", delim = "\t", col_names = c("CHROM", "POS", "N_ALLELES", "N_CHR", "a1" ,"a2"), skip = 1)
#Create minor allele frequency 
freq2$maf <- freq2 %>% select(a1, a2) %>% apply(1, function(z) min(z))
summary(freq2$maf)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.03571 0.03846 0.07692 0.13834 0.17857 0.50000
f2 <- ggplot(freq2, aes(maf)) + geom_density(fill = "dodgerblue1", colour = "black", alpha = 0.3) + ggtitle("Allele frquency")

#Depth per individual
depth2 <- read_delim("gwas/group2.idepth", delim = "\t")
g2 <- ggplot(depth2, aes(INDV,MEAN_DEPTH)) + geom_col() + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) + ggtitle("Depth per individual")

#Depth per variant
depth_var2 <- read_delim("gwas/group2.ldepth.mean", delim = "\t")
summary(depth_var2$MEAN_DEPTH)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    4.90   16.36   31.00   50.00   78.09 1069.00
dv2 <- ggplot(depth_var2, aes(MEAN_DEPTH)) + geom_density(fill = "dodgerblue1", colour = "black", alpha = 0.3) + theme_light() + xlim(0,300) + ggtitle("Depth per variant")

#Missing per individual
miss_ind2 <- read_delim("gwas/group2.imiss", delim = "\t")
mi2 <- ggplot(miss_ind2, aes(INDV, F_MISS)) + geom_col() + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) + ggtitle("Missing per individual")

#Missing per variant
miss_var2 <- read_delim("gwas/group2.lmiss", delim = "\t")
summary(miss_var2$F_MISS)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.00000 0.00000 0.07143 0.11285 0.21429 0.28571
mv2 <- ggplot(miss_var2, aes(F_MISS)) + geom_density(fill = "dodgerblue1", colour = "black", alpha = 0.3) + ggtitle("Missing per variant")


ggarrange(f2, g2, dv2, mi2, mv2, 
          ncol = 2, nrow = 3)

Filtering vcf file

Using vcftools to filter the SNP data. We set --max-missing to 1 to remove all missing data for the further analyses

#Group 1
# Keep only sites with no missing data, site mean depth 5-200, and
# per-genotype depth 5-200.
# Fix: the original had "--maxDP 5 --maxDP 200" — the stray "--maxDP 5"
# contradicts "--minDP 5" and is removed.
vcftools --vcf populations1.snps.vcf --max-missing 1 --min-meanDP 5 --max-meanDP 200 --minDP 5 --maxDP 200 --recode --out populations1.snps

# Group 2
# Same filter as Group 1: no missing data, site mean depth 5-200, and
# per-genotype depth 5-200.
# Fix: the original had "--maxDP 5 --maxDP 200" — the stray "--maxDP 5"
# contradicts "--minDP 5" and is removed.
vcftools --vcf populations2.snps.vcf --max-missing 1 --min-meanDP 5 --max-meanDP 200 --minDP 5 --maxDP 200 --recode --out populations2.snps

Checking filtered vcf on R

library(vcfR)

#Check vcf file
vcf1 <- read.vcfR("gwas/populations1.snps.recode.vcf", verbose = FALSE)
show(vcf1)
## ***** Object of Class vcfR *****
## 14 samples
## 5918 CHROMs
## 8,687 variants
## Object size: 12.4 Mb
## 0 percent missing data
## *****        *****         *****
vcf2 <- read.vcfR("gwas/populations2.snps.recode.vcf", verbose = FALSE)
show(vcf2)
## ***** Object of Class vcfR *****
## 14 samples
## 5188 CHROMs
## 7,553 variants
## Object size: 10.7 Mb
## 0 percent missing data
## *****        *****         *****

Prepare data for statistics

Group 1

library(vcfR)
library(tidyverse)

# Extract the GT field, convert to numeric, and transpose so rows are
# individuals and columns are SNPs.
# NOTE(review): extract.gt(as.numeric = TRUE) assumes genotype strings
# convert cleanly to 0/1 codes — confirm against the Stacks VCF output.
gt1 <- as.data.frame(t(extract.gt(vcf1, element = "GT", as.numeric = TRUE)))
gt1 <- rownames_to_column(gt1, var = "id")

# Read phenotype table and encode the phenotype as binary:
# NO_TOLERANT -> 0, THERM_TOLERANT -> 1 (anything else becomes NA).
pop1 <- read.table("gwas/phenotype1.txt", col.names = c("id", "population", "region", "phenotype"))
pop1 <- pop1 %>% mutate(phenotype = case_when(
  phenotype == "NO_TOLERANT" ~ 0,
  phenotype == "THERM_TOLERANT" ~ 1
))

# Join phenotype and genotypes by sample id and save for the statistics step
data1 <- merge(pop1, gt1, by = "id")
write.table(data1, file = "gwas/snpdata_group1.txt", sep = "\t", row.names = FALSE)

Group 2

library(vcfR)
library(tidyverse)

#Extract only genotyping and transform as numeric (0 or 1) and transpose
gt2 <- as.data.frame(t(extract.gt(vcf2, element = "GT", as.numeric = TRUE)))
gt2 <- rownames_to_column(gt2, var = "id")

#Add phenotype
pop2 <- read.table("gwas/phenotype2.txt", col.names = c("id", "population", "region", "phenotype"))
pop2 <- pop2 %>% mutate(phenotype = case_when(
  phenotype == "NO_TOLERANT" ~ 0,
  phenotype == "THERM_TOLERANT" ~ 1
))

#Add phenotype to gt
data2 <- merge(pop2, gt2, by = "id")
write.table(data2, file = "gwas/snpdata_group2.txt", sep = "\t", row.names = FALSE)

Statistics

Testing correlation between each SNP and phenotype (therm-tolerant or not). Both data, phenotype and SNPs are binary, so we used the Binary similarity coefficients to test the correlation to tolerance. When the correlation to tolerance is close to 1, the similarity between the SNP and the phenotype is total, when close to 0, the SNP is neutral, when close to -1, the similarity is total opposite. We used Phi method for binary similarity coefficient (Yule GU. 1912. On the methods of measuring association between two attributes. J. Royal Stat. Soc. 75: 579–642.)

\[ S_{ij} = \frac{(a*d-b*c)}{\sqrt{(a+b)*(a+c)*(b+d)*(c+d)}} \]

Group 1

library(data.table)

## Loading and checking data
dat1 <- fread("gwas/snpdata_group1.txt", drop = c(2,3))
# drop population and region columns


# Remove monomorphic SNPs (all 0 or all 1 across individuals): not informative.
# vapply over columns replaces apply(), which coerces the data.table to a
# character matrix (because of the "id" column) and compares strings instead
# of numbers.
colsel <- vapply(dat1, function(x) all(x == 0) | all(x == 1), logical(1))
dat1 <- dat1[, !colsel, with = FALSE]
rm(colsel)

# SNPs names
snps1 <- names(dat1)[-(1:2)]

## Coefficient of similarity
source("gwas/binary_similarity_coefficients.R")

sims1 <- sapply(snps1, function(x) similarity(dat1$phenotype, dat1[[x]], "phi"))

#x <- as.data.table(sims, TRUE)[abs(sims) > 0.5]

p1 <- ggplot(as.data.table(sims1, TRUE)[abs(sims1) > .5],
       aes(reorder(rn, abs(sims1)), sims1)) +
       geom_col(width = .2) + geom_point(aes(color = sims1), size = 4) +
       scale_color_gradient2(low = '#533100', mid = 'gray60', high = '#03655D', guide = "none") +
       theme_minimal() +
       theme(axis.text.y = element_text(face = "bold")) +
       coord_flip() +
       labs(x = NULL, y = "correlation to tolerance",
            caption = "only SNPs with correlation greater than ±0.5") +
       ggtitle("Group 1")

Group 2

library(data.table)

## Loading and checking data
dat2 <- fread("gwas/snpdata_group2.txt", drop = c(2,3))
# drop population and region columns


# Remove monomorphic SNPs (all 0 or all 1 across individuals): not informative.
# vapply over columns replaces apply(), which coerces the data.table to a
# character matrix (because of the "id" column) and compares strings instead
# of numbers.
colsel <- vapply(dat2, function(x) all(x == 0) | all(x == 1), logical(1))
dat2 <- dat2[, !colsel, with = FALSE]
rm(colsel)

# SNPs names
snps2 <- names(dat2)[-(1:2)]

## Coefficient of similarity
source("gwas/binary_similarity_coefficients.R")

sims2 <- sapply(snps2, function(x) similarity(dat1$phenotype, dat2[[x]], "phi"))

#x <- as.data.table(sims, TRUE)[abs(sims) > 0.5]

p2 <- ggplot(as.data.table(sims2, TRUE)[abs(sims2) > .5],
       aes(reorder(rn, abs(sims2)), sims2)) +
       geom_col(width = .2) + geom_point(aes(color = sims2), size = 4) +
       scale_color_gradient2(low = '#533100', mid = 'gray60', high = '#03655D', guide = "none") +
       theme_minimal() +
       theme(axis.text.y = element_text(face = "bold")) +
       coord_flip() +
       labs(x = NULL, y = "correlation to tolerance",
            caption = "only SNPs with correlation greater than ±0.5") +
       ggtitle("Group 2")

ggarrange(p1, p2, 
          ncol = 2, nrow = 1)

There are no SNPs with total similarity to the phenotype (S = 1) or complete opposition (S = -1). We considered similarities beyond ±0.5 as potential SNPs related (or inversely related) to the phenotype.

DONE!